In [1]:

    
import pandas as pd
import numpy as np
import sys
%matplotlib inline



In [2]:

    
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__









    



Python version 2.7.5 |Anaconda 2.1.0 (64-bit)| (default, Jul  1 2013, 12:37:52) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.15.2

Select

How do I select a random sample of a group?



In [3]:

    
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df



In [4]:

    
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice

# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()


#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities

#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"









    Out[4]:





group1  group2
a       1         2
        2         1
        3         2
        4         1
b       1         2
        2         2
        3         1
        4         1
c       3         1
        4         1
        5         2
        6         1
dtype: int64



In [5]:

    
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]









    Out[5]:






  
    
      
      group1
      group2
      value

How do I slice each row of a column?



In [6]:

    
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df



In [7]:

    
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])









    Out[7]:





0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

How can I select rows of my dataframe based on a "complex" filter applied to multiple columns?



In [8]:

    
d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 

df = pd.DataFrame(data=d)
df



In [9]:

    
# where all values in column "Num1" are positive
positive = df['Num1'] > 0

# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1

# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]

How to get the maximum value of a group?



In [10]:

    
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })
df



In [11]:

    
# Method 1
df.groupby('col1').apply(lambda x: x.max())



In [12]:

    
# Method 2
df.groupby('col1').agg('max')

How to select records from one level of a multi-index data frame?



In [13]:

    
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df



In [14]:

    
df.xs('a', level='group1')

How do I reset the index when the index names are the same as the column names?



In [15]:

    
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df









    Out[15]:






  
    
      
      City
      Name
    
  
  
    
      0
        Seattle
         Alice
    
    
      1
        Seattle
           Bob
    
    
      2
       Portland
       Mallory
    
    
      3
        Seattle
       Mallory
    
    
      4
        Seattle
           Bob
    
    
      5
       Portland
       Mallory



In [16]:

    
group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()









    Out[16]:






  
    
      
      City
      Name
      0
    
  
  
    
      0
       Portland_size
       Mallory_size
       2
    
    
      1
        Seattle_size
         Alice_size
       1
    
    
      2
        Seattle_size
           Bob_size
       2
    
    
      3
        Seattle_size
       Mallory_size
       1

Author: David Rojas

	group1	group2	value
0	a	1	apple
1	b	2	pear
2	a	3	orange
3	a	4	apple
4	b	1	banana
5	c	3	durian
6	c	5	lemon
7	c	6	lime
8	c	5	raspberry
9	c	4	durian
10	a	1	peach
11	a	2	nectarine
12	a	3	banana
13	b	4	lemon
14	b	3	guava
15	b	2	blackberry
16	b	1	grape

	text
0	abcdef
1	abcdef
2	abcdef
3	abcdef
4	abcdef
5	abcdef
6	abcdef
7	abcdef
8	abcdef
9	abcdef

		value
group1	group2
a	apple	1
b	pear	2
a	orange	3
a	apple	4
b	banana	1
c	durian	3
	lemon	5
	lime	6
	raspberry	5
	durian	4
a	peach	1
	nectarine	2
	banana	3
b	lemon	4
	guava	3
	blackberry	2
	grape	1

	City	Name
0	Seattle	Alice
1	Seattle	Bob
2	Portland	Mallory
3	Seattle	Mallory
4	Seattle	Bob
5	Portland	Mallory

	City	Name	0
0	Portland_size	Mallory_size	2
1	Seattle_size	Alice_size	1
2	Seattle_size	Bob_size	2
3	Seattle_size	Mallory_size	1